This report includes the automated proecesses of data verification, validation, quality assessment, and preliminary analysis for a Water Quality Program’s Black Lake Pollution Identification and Correction Project. QA related to proper field and laboratory processes are conducted independently of this report. Code is included for transparency and reproducibility.
If the decision to accept or reject a value is unclear, the option most protective of public health will be chosen.
library(ggplot2)
library(plotly)
library(plyr)
library(flexdashboard)
library(tidyverse)
library(lubridate)
library(leaflet)
library(htmltools)
library(leaflet.extras)
library(leaflet.providers)
library(datasets)
library(sf)
library(crosstalk)
library(dplyr)
library(reactable)
library(mapview)
library(httr)
library(bslib)
library(xfun)
#Plotly
devtools::install_github("ropensci/plotly")
library(plotly)
library(RCurl)
#df <- read.csv("https://raw.githubusercontent.com/AlyssaPeter/R-Code-Sample/main/EIM_Black_Lake_PIC_Project_Data.csv", stringsAsFactors = TRUE, fileEncoding = "ISO-8859-1")
df <- read.csv("O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/EIM_Black_Lake_PIC_Project_Data.csv", stringsAsFactors=FALSE, fileEncoding="latin1")
df #load dataframe
Basic Checks Check the dataframe for null values.
null_values <- sapply(df, function(x) sum(is.na(x)))
null_values
## Ã.........Study_ID Location_ID
## 0 0
## Study_Specific_Location_ID Field_Collection_Type
## 0 0
## Field_Collector Field_Collection_Start_Date
## 0 0
## Field_Collection_Comment Latitude_Decimal_Degrees
## 0 0
## Longitude_Decimal_Degrees Sample_ID
## 0 0
## Sample_Field_Replicate_ID Sample_Replicate_Flag
## 0 0
## Sample_Composite_Flag Sample_Matrix
## 0 0
## Sample_Source Sample_Collection_Method
## 0 0
## Sample_Preparation_Method Result_Parameter_Name
## 532 0
## Result_Parameter_CAS_Number Lab_Analysis_Date
## 0 0
## Result_Value Result_Value_Units
## 0 0
## Result_Reporting_Limit Result_Reporting_Limit_Type
## 0 532
## Result_Detection_Limit Result_Detection_Limit_Type
## 0 0
## Result_Data_Qualifier Fraction_Analyzed
## 0 0
## Result_Method Result_Lab_Name
## 0 0
#Sometimes values are input to data table with trailing spaces. Ensure that all columns have valid values.
#For example:
unique(df$Study_Specific_Location_ID)
## [1] "BLA001" "BLA003" "BLA004" "BLA005" "BLA006" "BLA007"
## [7] "BLA002" "BLA008" "BLA009" "BLA011" "BLA00201" "BLA00202"
## [13] "BLA00203" "BLA00301" "BLA00204" "BLA00501" "BLA00502" "BLA00701"
## [19] "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706" "BLA00707"
## [25] "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101"
## [31] "BLA00105" "BLA00902" "BLA00903" "BLA00904" "BLA00905" "BLA00601"
## [37] "BLA00602" "BLA00710" "BLA00712"
# [1] "BLA001" "BLA003" "BLA004" "BLA005" "BLA006" "BLA007" "BLA002" "BLA008" "BLA009" "BLA00201" "BLA00202"
# [12] "BLA00203" "BLA00204" "BLA00301" "BLA00501" "BLA00502" "BLA00701" "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706"
# [23] "BLA00707" "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101" "BLA00105" "BLA007 "
#I can see that the the column Study Specific Location ID has a repeat site ID of BLA007 with a trailing space. I will then identify the record(s) with the trailing space and fix them.
df$Study_Specific_Location_ID <- trimws(df$Study_Specific_Location_ID, which = c("right"))
unique(df$Study_Specific_Location_ID) #check again, BLA007 with trailing space has been corrected
## [1] "BLA001" "BLA003" "BLA004" "BLA005" "BLA006" "BLA007"
## [7] "BLA002" "BLA008" "BLA009" "BLA011" "BLA00201" "BLA00202"
## [13] "BLA00203" "BLA00301" "BLA00204" "BLA00501" "BLA00502" "BLA00701"
## [19] "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706" "BLA00707"
## [25] "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101"
## [31] "BLA00105" "BLA00902" "BLA00903" "BLA00904" "BLA00905" "BLA00601"
## [37] "BLA00602" "BLA00710" "BLA00712"
# [1] "BLA001" "BLA003" "BLA004" "BLA005" "BLA006" "BLA007" "BLA002" "BLA008" "BLA009" "BLA00201" "BLA00202"
# [12] "BLA00203" "BLA00204" "BLA00301" "BLA00501" "BLA00502" "BLA00701" "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706"
# [23] "BLA00707" "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101" "BLA00105"
#repeat for any other variables of concern
unique(df$Result_Parameter_Name)
## [1] "Total Phosphorus" "E. coli"
# [1] "E.coli" "Total Nitrogen " "Total Phosphorus" #TN has a trailing space that needs to be fixed
Spatial Errors Check if Latitude and Longitude are the same per each Study_Specific_Location_ID. If the return is “TRUE”, then results are okay.
location_coords_consistency <- df %>%
group_by(Study_Specific_Location_ID) %>%
summarise(
consistent_latitude = n_distinct(Latitude_Decimal_Degrees) == 1,
consistent_longitude = n_distinct(Longitude_Decimal_Degrees) == 1
)
location_coords_consistency
## # A tibble: 39 × 3
## Study_Specific_Location_ID consistent_latitude consistent_longitude
## <chr> <lgl> <lgl>
## 1 BLA001 TRUE TRUE
## 2 BLA00101 TRUE TRUE
## 3 BLA00102 TRUE TRUE
## 4 BLA00103 TRUE TRUE
## 5 BLA00104 TRUE TRUE
## 6 BLA00105 TRUE TRUE
## 7 BLA002 TRUE TRUE
## 8 BLA00201 TRUE TRUE
## 9 BLA00202 TRUE TRUE
## 10 BLA00203 TRUE TRUE
## # ℹ 29 more rows
FALSE values are observed for some sites. First, define correct values.
# Define the mappings for Latitude and Longitude by Study_Specific_Location_ID
latitude_mapping <- c(
BLA001 = 46.999244, BLA002 = 46.988902, BLA003 = 46.9891, BLA004 = 46.980352,
BLA005 = 46.9794, BLA006 = 47.00784, BLA007 = 46.980353, BLA008 = 47.009801,
BLA009 = 46.996677, BLA00201 = 46.98639, BLA00202 = 46.98759, BLA00203 = 46.99045,
BLA00204 = 46.993164, BLA00301 = 46.98804, BLA00501 = 46.979515, BLA00502 = 46.977944, BLA00701 = 46.98082,
BLA00702 = 46.97965, BLA00703 = 46.97973, BLA00704 = 46.97975, BLA00705 = 46.97927, BLA00706 = 46.97915,
BLA00707 = 46.97786, BLA00708 = 46.97975, BLA00709 = 46.98072, BLA00101 = 47.00011, BLA00102 = 46.9989607,
BLA00103 = 47.0011206, BLA00104 = 47.0017544, BLA00105 = 47.0034, BLA00601 = 47.0088, BLA00902 = 46.9945, BLA00903 = 46.99592, BLA00904 = 46.99712, BLA00905 = 46.99808
)
longitude_mapping <- c(
BLA001 = -122.971982, BLA002 = -122.964812, BLA003 = -122.96155, BLA004 = -122.975222,
BLA005 = -122.9714, BLA006 = -122.97618, BLA007 = -122.9882787, BLA008 = -122.965046,
BLA009 = -122.984555, BLA00201 = -122.97224, BLA00202 = -122.96935, BLA00203 = -122.96175,
BLA00204 = -122.957002, BLA00301 = -122.96062, BLA00501 = -122.971299, BLA00502 = -122.968398, BLA00701 = -122.98925,
BLA00702 = -122.99265, BLA00703 = -122.99271, BLA00704 = -122.99252, BLA00705 = -122.99153, BLA00706 = -122.99149,
BLA00707 = -122.99137, BLA00708 = -122.99429, BLA00709 = -122.98933, BLA00101 = -122.96937, BLA00102 = -122.9687191,
BLA00103 = -122.965938, BLA00104 = -122.9657841, BLA00105 = -122.96321, BLA00601 = -122.97644, BLA00902 = -122.98056, BLA00903 = -122.9839, BLA00904 = -122.98515, BLA00905 = -122.98692
)
Update the lat/long to the defined values:
df$Latitude_Decimal_Degrees <- ifelse(df$Study_Specific_Location_ID %in% names(latitude_mapping), latitude_mapping[df$Study_Specific_Location_ID], df$Latitude_Decimal_Degrees)
df$Longitude_Decimal_Degrees <- ifelse(df$Study_Specific_Location_ID %in% names(longitude_mapping), longitude_mapping[df$Study_Specific_Location_ID], df$Longitude_Decimal_Degrees)
Create a quick map to check visually that coordinates are now correct.
df$Longitude_Decimal_Degrees = as.numeric(as.character(df$Longitude_Decimal_Degrees)) #lat/long as numeric
map <- df %>%
select(Study_Specific_Location_ID, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees)
map <- print(distinct(map))
## Study_Specific_Location_ID Latitude_Decimal_Degrees
## 1 BLA001 46.99924
## 2 BLA003 46.98910
## 3 BLA004 46.98035
## 4 BLA005 46.97940
## 5 BLA006 47.00784
## 6 BLA007 46.98035
## 7 BLA002 46.98890
## 8 BLA008 47.00980
## 9 BLA009 46.99668
## 10 BLA011 46.92828
## 11 BLA00201 46.98639
## 12 BLA00202 46.98759
## 13 BLA00203 46.99045
## 14 BLA00301 46.98804
## 15 BLA00204 46.99316
## 16 BLA00501 46.97951
## 17 BLA00502 46.97794
## 18 BLA00701 46.98082
## 19 BLA00702 46.97965
## 20 BLA00703 46.97973
## 21 BLA00704 46.97975
## 22 BLA00705 46.97927
## 23 BLA00706 46.97915
## 24 BLA00707 46.97786
## 25 BLA00708 46.97975
## 26 BLA00709 46.98072
## 27 BLA00102 46.99896
## 28 BLA00103 47.00112
## 29 BLA00104 47.00175
## 30 BLA00101 47.00011
## 31 BLA00105 47.00340
## 32 BLA00902 46.99450
## 33 BLA00903 46.99592
## 34 BLA00904 46.99712
## 35 BLA00905 46.99808
## 36 BLA00601 47.00880
## 37 BLA00602 47.00785
## 38 BLA00710 46.98029
## 39 BLA00712 46.98012
## Longitude_Decimal_Degrees
## 1 -122.9720
## 2 -122.9616
## 3 -122.9752
## 4 -122.9714
## 5 -122.9762
## 6 -122.9883
## 7 -122.9648
## 8 -122.9650
## 9 -122.9846
## 10 -123.0081
## 11 -122.9722
## 12 -122.9694
## 13 -122.9617
## 14 -122.9606
## 15 -122.9570
## 16 -122.9713
## 17 -122.9684
## 18 -122.9892
## 19 -122.9926
## 20 -122.9927
## 21 -122.9925
## 22 -122.9915
## 23 -122.9915
## 24 -122.9914
## 25 -122.9943
## 26 -122.9893
## 27 -122.9687
## 28 -122.9659
## 29 -122.9658
## 30 -122.9694
## 31 -122.9632
## 32 -122.9806
## 33 -122.9839
## 34 -122.9852
## 35 -122.9869
## 36 -122.9764
## 37 -122.9762
## 38 -122.9889
## 39 -122.9902
map <- map %>%
na.omit()
map
## Study_Specific_Location_ID Latitude_Decimal_Degrees
## 1 BLA001 46.99924
## 2 BLA003 46.98910
## 3 BLA004 46.98035
## 4 BLA005 46.97940
## 5 BLA006 47.00784
## 6 BLA007 46.98035
## 7 BLA002 46.98890
## 8 BLA008 47.00980
## 9 BLA009 46.99668
## 10 BLA011 46.92828
## 11 BLA00201 46.98639
## 12 BLA00202 46.98759
## 13 BLA00203 46.99045
## 14 BLA00301 46.98804
## 15 BLA00204 46.99316
## 16 BLA00501 46.97951
## 17 BLA00502 46.97794
## 18 BLA00701 46.98082
## 19 BLA00702 46.97965
## 20 BLA00703 46.97973
## 21 BLA00704 46.97975
## 22 BLA00705 46.97927
## 23 BLA00706 46.97915
## 24 BLA00707 46.97786
## 25 BLA00708 46.97975
## 26 BLA00709 46.98072
## 27 BLA00102 46.99896
## 28 BLA00103 47.00112
## 29 BLA00104 47.00175
## 30 BLA00101 47.00011
## 31 BLA00105 47.00340
## 32 BLA00902 46.99450
## 33 BLA00903 46.99592
## 34 BLA00904 46.99712
## 35 BLA00905 46.99808
## 36 BLA00601 47.00880
## 37 BLA00602 47.00785
## 38 BLA00710 46.98029
## 39 BLA00712 46.98012
## Longitude_Decimal_Degrees
## 1 -122.9720
## 2 -122.9616
## 3 -122.9752
## 4 -122.9714
## 5 -122.9762
## 6 -122.9883
## 7 -122.9648
## 8 -122.9650
## 9 -122.9846
## 10 -123.0081
## 11 -122.9722
## 12 -122.9694
## 13 -122.9617
## 14 -122.9606
## 15 -122.9570
## 16 -122.9713
## 17 -122.9684
## 18 -122.9892
## 19 -122.9926
## 20 -122.9927
## 21 -122.9925
## 22 -122.9915
## 23 -122.9915
## 24 -122.9914
## 25 -122.9943
## 26 -122.9893
## 27 -122.9687
## 28 -122.9659
## 29 -122.9658
## 30 -122.9694
## 31 -122.9632
## 32 -122.9806
## 33 -122.9839
## 34 -122.9852
## 35 -122.9869
## 36 -122.9764
## 37 -122.9762
## 38 -122.9889
## 39 -122.9902
# Create a leaflet map
map1 <- leaflet(map) %>%
addTiles() %>% # Add default OpenStreetMap tiles
addMarkers(lng = ~Longitude_Decimal_Degrees, lat = ~Latitude_Decimal_Degrees, popup = ~paste(map$Study_Specific_Location_ID))
map1
If coordinates are now correct, overwrite the csv:
write.csv(df, file="O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/EIM_Black_Lake_PIC_Project_Data.csv", row.names=FALSE)
Result Parameter CAS Number
# find location of missing values
print("Position of missing values ")
## [1] "Position of missing values "
which(is.na(df$Result_Parameter_CAS_Number))
## integer(0)
# count total missing values
print("Count of total missing values")
## [1] "Count of total missing values"
sum(df$Result_Parameter_CAS_Number== "")
## [1] 215
215 rows are missing Result Parameter CAS Number. This should be corrected
Result Data Qualifier When Result Value = 2420, qualifier should be “G” unless “REJ”
underestimates <- df %>% filter(df$Result_Value==2420)
qualifier_table <- underestimates %>% select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value, Result_Data_Qualifier)
print(qualifier_table)
## Study_Specific_Location_ID Field_Collection_Start_Date Result_Value
## 1 BLA006 7/5/2023 2420
## 2 BLA00301 8/31/2023 2420
## 3 BLA00301 9/21/2023 2420
## 4 BLA00705 2/12/2024 2420
## 5 BLA007 2/28/2024 2420
## Result_Data_Qualifier
## 1 G
## 2 G
## 3 G
## 4 G
## 5 G
Add qualifier G to any 2420 records without it.
When Result Value = 1, qualifier should be “E” unless “REJ”
overestimates <- df %>% filter(df$Result_Value==1)
qualifier_tablea <- overestimates %>% select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value, Result_Data_Qualifier)
print(qualifier_tablea)
## Study_Specific_Location_ID Field_Collection_Start_Date Result_Value
## 1 BLA008 12/14/2023 1
## 2 BLA00705 1/29/2024 1
## 3 BLA00702 2/20/2024 1
## 4 BLA00703 2/20/2024 1
## 5 BLA00102 1/20/2024 1
## 6 BLA008 3/27/2024 1
## 7 BLA00712 4/1/2024 1
## Result_Data_Qualifier
## 1 E
## 2 E
## 3 E
## 4 E
## 5 REJ
## 6 REJ
## 7 REJ
All records need qualifier “E” added
Calculate the number of QA samples taken. The result should be 0.10 or greater (at least 10% of the total).
#After identifying REJ records, run the following code again:
#df_REJ <- df[df$Result_Data_Qualifier != "REJ", ]
replicate_flag_counts <- df %>%
summarise(
total_count = n(),
replicate_count = sum(Sample_Replicate_Flag == "Y"),
replicate_proportion = replicate_count / total_count
)
replicate_flag_counts
## total_count replicate_count replicate_proportion
## 1 532 60 0.112782
Based on results, provide a recommendation to the team to increase/decrease/maintain QA sample frequency.
Implementation Anomalies Check that sample values are within expected range. If no records are returned, no values are out of expected range. Any values that are returned, please review and accept/reject accordingly.
##Check expected values
#E.coli is expected to be between 1 and 2419.6
#TP is expected to be between 0.005 and 1000
# Filter records for Total Phosphorus and E. coli that don't meet the criteria
invalid_records <- df %>%
filter(
(Result_Parameter_Name == "Total Phosphorus" & (Result_Value < 0.005 | Result_Value > 1000)) |
(Result_Parameter_Name == "E. coli" & (Result_Value < 1 | Result_Value > 2420))
)
# Display the records that fall out of bounds
invalid_records
## [1] Ã.........Study_ID Location_ID
## [3] Study_Specific_Location_ID Field_Collection_Type
## [5] Field_Collector Field_Collection_Start_Date
## [7] Field_Collection_Comment Latitude_Decimal_Degrees
## [9] Longitude_Decimal_Degrees Sample_ID
## [11] Sample_Field_Replicate_ID Sample_Replicate_Flag
## [13] Sample_Composite_Flag Sample_Matrix
## [15] Sample_Source Sample_Collection_Method
## [17] Sample_Preparation_Method Result_Parameter_Name
## [19] Result_Parameter_CAS_Number Lab_Analysis_Date
## [21] Result_Value Result_Value_Units
## [23] Result_Reporting_Limit Result_Reporting_Limit_Type
## [25] Result_Detection_Limit Result_Detection_Limit_Type
## [27] Result_Data_Qualifier Fraction_Analyzed
## [29] Result_Method Result_Lab_Name
## <0 rows> (or 0-length row.names)
Relative Percent Difference
Check the Relative Percent Difference (RPD) for E. coli and phosphorus samples.Samples outside defined thresholds may be rejected.
\[ RPD = \frac{\lvert{R1-R2}\rvert}{\frac{R1+R2}{2}}\times100 \] Total Phosphorus: TP QA sample values must be within 20% of the sample. OR if TP<0.025, sample is accepted even if they fall outside the 20% range.
# Filter dataframe by result_parameter_name = 'Total Phosphorus'
total_phosphorus_data <- df %>%
filter(Result_Parameter_Name == "Total Phosphorus")
# Filter for Sample_Replicate_Flag == "Y"
replicate_Y_data <- total_phosphorus_data %>%
filter(Sample_Replicate_Flag == "Y")
# Pair rows for Sample_Replicate_Flag == "Y" with Sample_Replicate_Flag == "N"
paired_data <- replicate_Y_data %>%
left_join(total_phosphorus_data %>%
filter(Sample_Replicate_Flag == "N"),
by = c("Study_Specific_Location_ID", "Field_Collection_Start_Date"),
suffix = c("_Y", "_N"))
# Calculate the relative percent difference between the two paired values
paired_data <- paired_data %>%
mutate(relative_percent_difference = abs(Result_Value_Y - Result_Value_N) / ((Result_Value_Y + Result_Value_N) / 2) * 100)
# Return a table showing both paired values, Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value, and calculated relative percent difference values
TP_result_table <- paired_data %>%
select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value_Y, Result_Value_N, relative_percent_difference)
# Print the table
print(TP_result_table)
## Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1 BLA003 7/12/2023 0.041
## 2 BLA004 7/20/2023 0.041
## 3 BLA005 7/25/2023 0.039
## 4 BLA007 9/14/2023 0.039
## 5 BLA008 10/12/2023 0.015
## 6 BLA009 10/12/2023 0.024
## 7 BLA001 12/14/2023 0.028
## 8 BLA009 12/14/2023 0.015
## 9 BLA002 1/11/2024 0.028
## 10 BLA004 1/11/2024 0.018
## 11 BLA009 1/11/2024 0.018
## 12 BLA00301 1/16/2024 0.039
## 13 BLA00702 1/29/2024 0.016
## 14 BLA002 1/29/2024 0.031
## 15 BLA00204 1/9/2024 0.043
## 16 BLA00705 1/22/2024 0.022
## 17 BLA00202 1/22/2024 0.032
## 18 BLA00201 2/5/2024 0.028
## 19 BLA007 2/5/2024 0.024
## 20 BLA00706 2/12/2024 0.013
## 21 BLA00101 2/12/2024 0.024
## 22 BLA00101 2/12/2024 0.024
## 23 BLA00102 2/20/2024 0.006
## 24 BLA002 2/28/2024 0.034
## 25 BLA003 2/28/2024 0.032
## 26 BLA004 2/28/2024 0.023
## 27 BLA001 3/4/2024 0.020
## Result_Value_N relative_percent_difference
## 1 0.041 0.000000
## 2 0.041 0.000000
## 3 0.050 24.719101
## 4 0.041 5.000000
## 5 0.015 0.000000
## 6 0.023 4.255319
## 7 0.030 6.896552
## 8 0.014 6.896552
## 9 0.030 6.896552
## 10 0.018 0.000000
## 11 NA NA
## 12 0.034 13.698630
## 13 0.015 6.451613
## 14 0.031 0.000000
## 15 0.043 0.000000
## 16 0.022 0.000000
## 17 0.032 0.000000
## 18 0.026 7.407407
## 19 0.023 4.255319
## 20 0.014 7.407407
## 21 0.023 4.255319
## 22 0.015 46.153846
## 23 0.006 0.000000
## 24 NA NA
## 25 0.033 3.076923
## 26 0.022 4.444444
## 27 0.020 0.000000
# Assign a group to each row based on the average of the paired Result_Value
TP_result_table <- TP_result_table %>%
mutate(
group = ifelse((Result_Value_Y + Result_Value_N) / 2 < 0.025, "accepted", "evaluate"),
highlight = ifelse(relative_percent_difference > 20, "highlight", "")
)
# Print the table with group and highlight columns
print(TP_result_table)
## Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1 BLA003 7/12/2023 0.041
## 2 BLA004 7/20/2023 0.041
## 3 BLA005 7/25/2023 0.039
## 4 BLA007 9/14/2023 0.039
## 5 BLA008 10/12/2023 0.015
## 6 BLA009 10/12/2023 0.024
## 7 BLA001 12/14/2023 0.028
## 8 BLA009 12/14/2023 0.015
## 9 BLA002 1/11/2024 0.028
## 10 BLA004 1/11/2024 0.018
## 11 BLA009 1/11/2024 0.018
## 12 BLA00301 1/16/2024 0.039
## 13 BLA00702 1/29/2024 0.016
## 14 BLA002 1/29/2024 0.031
## 15 BLA00204 1/9/2024 0.043
## 16 BLA00705 1/22/2024 0.022
## 17 BLA00202 1/22/2024 0.032
## 18 BLA00201 2/5/2024 0.028
## 19 BLA007 2/5/2024 0.024
## 20 BLA00706 2/12/2024 0.013
## 21 BLA00101 2/12/2024 0.024
## 22 BLA00101 2/12/2024 0.024
## 23 BLA00102 2/20/2024 0.006
## 24 BLA002 2/28/2024 0.034
## 25 BLA003 2/28/2024 0.032
## 26 BLA004 2/28/2024 0.023
## 27 BLA001 3/4/2024 0.020
## Result_Value_N relative_percent_difference group highlight
## 1 0.041 0.000000 evaluate
## 2 0.041 0.000000 evaluate
## 3 0.050 24.719101 evaluate highlight
## 4 0.041 5.000000 evaluate
## 5 0.015 0.000000 accepted
## 6 0.023 4.255319 accepted
## 7 0.030 6.896552 evaluate
## 8 0.014 6.896552 accepted
## 9 0.030 6.896552 evaluate
## 10 0.018 0.000000 accepted
## 11 NA NA <NA> <NA>
## 12 0.034 13.698630 evaluate
## 13 0.015 6.451613 accepted
## 14 0.031 0.000000 evaluate
## 15 0.043 0.000000 evaluate
## 16 0.022 0.000000 accepted
## 17 0.032 0.000000 evaluate
## 18 0.026 7.407407 evaluate
## 19 0.023 4.255319 accepted
## 20 0.014 7.407407 accepted
## 21 0.023 4.255319 accepted
## 22 0.015 46.153846 accepted highlight
## 23 0.006 0.000000 accepted
## 24 NA NA <NA> <NA>
## 25 0.033 3.076923 evaluate
## 26 0.022 4.444444 accepted
## 27 0.020 0.000000 accepted
#Values highlighted in table are REJ
RPD for samples needs to be evaluated in context. Results returned in the table as “accepted” will be accepted regardless of RPD because the sample concentrations are close to the detection limit.
Those labeled “evaluate” should be assessed based on both the RPD value and what is known about the site heterogeneity.
BLA005, collected on 7/25/2023, does not meet quality standards and will be removed from the analysis.
E. coli: Bacteria samples with low counts tend to have higher variability. Therefore, EC sample pairs (sample and field duplicate) will be separated into two groups:
• “low count samples” where the pair mean ≤ 20 MPN/100 mL and • “higher count samples” where the pair mean > 20 MPN/100 mL.
For precision of bacteria field replicates: • 50% of the replicate pairs must be at or below 20% RPD • 90% of the pairs must be at or below 50% RPD
# Filter for E. coli
ecoli_data <- subset(df, Result_Parameter_Name == 'E. coli')
#Filter for records without REJ quaifier (to be done after running with entire dataset)
#ecoli_data <- ecoli_data[ecoli_data$Result_Data_Qualifier != "REJ", ]
# Filter for Sample_Replicate_Flag == "Y"
replicate_Y_data <- ecoli_data %>%
filter(Sample_Replicate_Flag == "Y")
# Pair rows for Sample_Replicate_Flag == "Y" with Sample_Replicate_Flag == "N"
paired_df <- replicate_Y_data %>%
left_join(ecoli_data %>%
filter(Sample_Replicate_Flag == "N"),
by = c("Study_Specific_Location_ID", "Field_Collection_Start_Date"),
suffix = c("_Y", "_N"))
# Calculate the relative percent difference between the two paired values
paired_df <- paired_df %>%
mutate(relative_percent_difference = abs(Result_Value_Y - Result_Value_N) / ((Result_Value_Y + Result_Value_N) / 2) * 100)
EC_result_table <- paired_df %>%
select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value_Y, Result_Value_N, relative_percent_difference)
# Return the table
EC_result_table
## Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1 BLA003 7/12/2023 410
## 2 BLA004 7/20/2023 365
## 3 BLA005 7/25/2023 63
## 4 BLA00203 9/14/2023 186
## 5 BLA007 9/14/2023 816
## 6 BLA008 10/12/2023 4
## 7 BLA009 10/12/2023 127
## 8 BLA00502 11/27/2023 3
## 9 BLA001 12/14/2023 15
## 10 BLA009 12/14/2023 9
## 11 BLA00204 1/9/2024 29
## 12 BLA002 1/11/2024 49
## 13 BLA004 1/11/2024 8
## 14 BLA002 1/22/2024 51
## 15 BLA00301 1/16/2024 93
## 16 BLA002 1/29/2024 3
## 17 BLA00201 2/5/2024 20
## 18 BLA00705 1/22/2024 44
## 19 BLA00702 1/29/2024 2
## 20 BLA00706 2/12/2024 206
## 21 BLA00102 1/20/2024 4
## 22 BLA00101 2/12/2024 41
## 23 BLA001 3/4/2024 19
## 24 BLA00103 3/11/2024 3
## 25 BLA003 2/28/2024 260
## 26 BLA004 2/28/2024 77
## 27 BLA00902 3/18/2024 5
## 28 BLA00902 3/25/2024 4
## 29 BLA005 3/27/2024 20
## 30 BLA008 3/27/2024 4
## 31 BLA00602 4/1/2024 126
## 32 BLA00712 4/1/2024 31
## 33 BLA007 4/8/2024 276
## 34 BLA00602 4/15/2024 24
## Result_Value_N relative_percent_difference
## 1 652 45.574388
## 2 236 42.928453
## 3 81 25.000000
## 4 214 14.000000
## 5 980 18.262806
## 6 6 40.000000
## 7 27 129.870130
## 8 3 0.000000
## 9 10 40.000000
## 10 9 0.000000
## 11 52 56.790123
## 12 45 8.510638
## 13 10 22.222222
## 14 46 10.309278
## 15 111 17.647059
## 16 6 66.666667
## 17 26 26.086957
## 18 56 24.000000
## 19 4 66.666667
## 20 166 21.505376
## 21 1 120.000000
## 22 46 11.494253
## 23 20 5.128205
## 24 12 120.000000
## 25 238 8.835341
## 26 50 42.519685
## 27 5 0.000000
## 28 5 22.222222
## 29 28 33.333333
## 30 1 120.000000
## 31 99 24.000000
## 32 1 187.500000
## 33 184 40.000000
## 34 28 15.384615
# Assign a group based on the average of the Result_Value_Replicate_Y and Result_Value_Replicate_N
EC_result_table$group <- ifelse((EC_result_table$Result_Value_Y + EC_result_table$Result_Value_N) / 2 <= 20,
"low count samples", "higher count samples")
# Calculate the percent of values with Relative_Percent_Difference less than or equal to 20 by group
percent_diff_20 <- with(EC_result_table, tapply(relative_percent_difference <= 20, group, mean) * 100)
# Calculate the percent of values with Relative_Percent_Difference less than or equal to 50 by group
percent_diff_50 <- with(EC_result_table, tapply(relative_percent_difference <= 50, group, mean) * 100)
# Combine the results into a data frame
percent_diff_results <- data.frame(
Group = c("low count samples", "higher count samples"),
Percent_Diff_LE_20 = c(percent_diff_20["low count samples"], percent_diff_20["higher count samples"]),
Percent_Diff_LE_50 = c(percent_diff_50["low count samples"], percent_diff_50["higher count samples"])
)
# Return the results
percent_diff_results
## Group Percent_Diff_LE_20 Percent_Diff_LE_50
## low count samples low count samples 28.57143 57.14286
## higher count samples higher count samples 40.00000 90.00000
# • 50% of the replicate pairs must be at or below 20% RPD
# • 90% of the pairs must be at or below 50% RPD
Determine which replicates are invalid. Investigate corrective actions.
50% of replicate pairs must be at or below 20% RPD. Currently, 40% of low count samples area and 37.5% of high count samples are.
90% of replicate pairs must be at or below 50% RPD. Currently, 100% of low count samples are and 75% of high count samples are.
Low count samples with non-anomalous larger RPD values will be included in analyses for the time being because the data set is currently incomplete and when more replicate samples have been collected, the percent of samples meeting the thresholds may meet quality standards. A final determination will be made when all data have been collected and the project concluded in 2025.
Analysis should be re-run with the corrected dataframe to see how quality assessment improves.
Prep data: Remove all duplicate records and any other rejected records:
# Create a new dataframe with the specified conditions
new_df <- df[!(df$Result_Data_Qualifier == "REJ" | df$Sample_Replicate_Flag == "Y"), ]
Define the date field so it may be recognized as a linear measure of time:
##Convert date class
new_df$Field_Collection_Start_Date
new_df <- new_df %>%
mutate(Field_Collection_Start_Date = mdy(Field_Collection_Start_Date))
head(new_df)
new_df$Field_Collection_Start_Date
Add a new field, “Season”, defined by the Field_Collection_Start_Date:
new_df$Season <- ifelse(format(as.Date(new_df$Field_Collection_Start_Date), "%m") %in% c("05", "06", "07", "08", "09"), "Dry", "Wet")
new_df
Save the prepped dataset as an object
save(new_df, file="J:/Git_WS/R-Code-Sample/Inputs/new_df.rda")
dir.create("O:/EH_Health/Surface Water/+ PIC/Black Lake Grant 2022-25/Data", recursive = TRUE, showWarnings = FALSE)
write.csv(new_df, file="O:/EH_Health/Surface Water/+ PIC/Black Lake Grant 2022-25/Data/Black_Lake_Data_Prepped.csv", row.names=FALSE)
Statistical quantities include measures of: - central tendency (mean, median, etc) - Relative standing (percentiles) - Dispersion (range, variance) - Association (correlation)
Review these quantities to determine: - Do the data look reasonable - do the values make sense? - Are there any obvious anomalies? - Are there any trends or patterns?
Look at numeric distributions:
# Plot numeric distributions of Result_Value by Result_Parameter_Name
ggplot(new_df, aes(x = Result_Value)) +
geom_histogram(bins = 30) +
facet_wrap(~Result_Parameter_Name, scales = "free_x") +
theme_minimal() +
xlab("Result Value") +
ylab("Frequency") +
ggtitle("Distributions of Result Value by Parameter Name")
##check normality
# Check the normality of Result_Value by Result_Parameter_Name using QQ plots
ggplot(new_df, aes(sample = Result_Value)) +
geom_qq() +
geom_qq_line() +
facet_wrap(~Result_Parameter_Name) +
theme_minimal() +
xlab("Theoretical Quantiles") +
ylab("Sample Quantiles") +
ggtitle("QQ Plots of Result Value by Parameter Name")
Calculate some preliminary statistics.
Return the number of samples taken by site:
#how many samples were taken?
nrow(new_df)
## [1] 461
#461
# Make table summary
summary <- new_df %>%
summarise(
`Total number E. coli Samples Taken` = sum(Result_Parameter_Name == "E. coli"),
`Total number of Total Phosphorus Samples Taken` = sum(Result_Parameter_Name == "Total Phosphorus"),
)
summary
## Total number E. coli Samples Taken
## 1 273
## Total number of Total Phosphorus Samples Taken
## 1 188
samplexsite <- table(new_df$Result_Parameter_Name, new_df$Study_Specific_Location_ID)
samplexsite
##
## BLA001 BLA00101 BLA00102 BLA00103 BLA00104 BLA00105 BLA002
## E. coli 18 4 3 3 4 4 19
## Total Phosphorus 17 5 4 4 4 3 15
##
## BLA00201 BLA00202 BLA00203 BLA00204 BLA003 BLA00301 BLA004
## E. coli 9 9 9 8 21 9 16
## Total Phosphorus 5 5 5 5 18 5 13
##
## BLA005 BLA00501 BLA00502 BLA006 BLA00601 BLA00602 BLA007
## E. coli 17 3 3 13 5 3 21
## Total Phosphorus 12 0 0 7 0 0 18
##
## BLA00701 BLA00702 BLA00703 BLA00704 BLA00705 BLA00706
## E. coli 1 3 3 7 4 4
## Total Phosphorus 1 5 3 5 5 5
##
## BLA00707 BLA00708 BLA00709 BLA00710 BLA00712 BLA008 BLA009
## E. coli 4 1 1 3 2 7 12
## Total Phosphorus 5 1 0 0 0 7 6
##
## BLA00902 BLA00903 BLA00904 BLA00905 BLA011
## E. coli 5 5 5 3 2
## Total Phosphorus 0 0 0 0 0
Calculate the geomean by routine site and season for E. coli:
#Remove all Segmentation samples and filter for E. coli only
df_filtered <- new_df %>%
filter(Field_Collection_Comment != "Segmentation", Result_Parameter_Name == "E. coli")
# Filter for "Wet" and "Dry" seasons only
df_season_filtered <- df_filtered %>%
filter(Season %in% c("Wet", "Dry"))
# Define a function to calculate the geometric mean
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))
# Aggregate by season and study_specific_location_id and calculate the geomean and counts
aggregated_df <- df_season_filtered %>%
group_by(Study_Specific_Location_ID, Season, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
summarise(
Geomean = geomean(Result_Value),
`Total Number of Samples` = n(),
`% of Samples above 100` = sum(Result_Value >= 100) / n() * 100,
`% of Samples above 320` = sum(Result_Value >= 320) / n() * 100,
.groups = 'drop'
)
aggregated_df
## # A tibble: 19 × 8
## Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
## <chr> <chr> <dbl> <dbl>
## 1 BLA001 Dry 47.0 -123.
## 2 BLA001 Wet 47.0 -123.
## 3 BLA002 Dry 47.0 -123.
## 4 BLA002 Wet 47.0 -123.
## 5 BLA003 Dry 47.0 -123.
## 6 BLA003 Wet 47.0 -123.
## 7 BLA004 Dry 47.0 -123.
## 8 BLA004 Wet 47.0 -123.
## 9 BLA005 Dry 47.0 -123.
## 10 BLA005 Wet 47.0 -123.
## 11 BLA006 Dry 47.0 -123.
## 12 BLA006 Wet 47.0 -123.
## 13 BLA007 Dry 47.0 -123.
## 14 BLA007 Wet 47.0 -123.
## 15 BLA008 Dry 47.0 -123.
## 16 BLA008 Wet 47.0 -123.
## 17 BLA009 Dry 47.0 -123.
## 18 BLA009 Wet 47.0 -123.
## 19 BLA011 Dry 46.9 -123.
## # ℹ abbreviated names: ¹Study_Specific_Location_ID, ²Latitude_Decimal_Degrees,
## # ³Longitude_Decimal_Degrees
## # ℹ 4 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## # `% of Samples above 100` <dbl>, `% of Samples above 320` <dbl>
Calculate the geomean by routine site and season for Total Phosphorus:
df_tp <- new_df %>%
filter(Field_Collection_Comment != "Segmentation", Result_Parameter_Name == "Total Phosphorus")
# Filter for "Wet" and "Dry" seasons only
df_season_tp <- df_tp %>%
filter(Season %in% c("Wet", "Dry"))
# Define a function to calculate the geometric mean
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))
# Aggregate by season and study_specific_location_id and calculate the geomean and counts
aggregated_df_tp <- df_season_tp %>%
group_by(Study_Specific_Location_ID, Season, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
summarise(
Geomean = geomean(Result_Value),
`Total Number of Samples` = n(),
`% of Samples above 0.01mg/L` = sum(Result_Value >= 0.01) / n() * 100,
.groups = 'drop'
)
aggregated_df_tp
## # A tibble: 18 × 7
## Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
## <chr> <chr> <dbl> <dbl>
## 1 BLA001 Dry 47.0 -123.
## 2 BLA001 Wet 47.0 -123.
## 3 BLA002 Dry 47.0 -123.
## 4 BLA002 Wet 47.0 -123.
## 5 BLA003 Dry 47.0 -123.
## 6 BLA003 Wet 47.0 -123.
## 7 BLA004 Dry 47.0 -123.
## 8 BLA004 Wet 47.0 -123.
## 9 BLA005 Dry 47.0 -123.
## 10 BLA005 Wet 47.0 -123.
## 11 BLA006 Dry 47.0 -123.
## 12 BLA006 Wet 47.0 -123.
## 13 BLA007 Dry 47.0 -123.
## 14 BLA007 Wet 47.0 -123.
## 15 BLA008 Dry 47.0 -123.
## 16 BLA008 Wet 47.0 -123.
## 17 BLA009 Dry 47.0 -123.
## 18 BLA009 Wet 47.0 -123.
## # ℹ abbreviated names: ¹Study_Specific_Location_ID, ²Latitude_Decimal_Degrees,
## # ³Longitude_Decimal_Degrees
## # ℹ 3 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## # `% of Samples above 0.01mg/L` <dbl>
#add Result_parameter_name column
aggregated_df_tp$Result_Parameter_Name <- "Total Phosphorus"
#add Meets_Standards column
aggregated_df_tp$`Meets_Standards` <- ifelse(aggregated_df_tp$Geomean < 0.01, "Yes", "No")
Calculate E. coli geomean for segmented sites by season:
#Repeat for segmented sites
df_seg <- new_df %>%
filter(Field_Collection_Comment == "Segmentation", Result_Parameter_Name == "E. coli")
# Filter for "Wet" and "Dry" seasons only
df_seg_filtered <- df_seg %>%
filter(Season %in% c("Wet", "Dry"))
# Define a function to calculate the geometric mean
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))
# Aggregate by season and study_specific_location_id and calculate the geomean and counts
aggregated_df_seg <- df_seg_filtered %>%
group_by(Study_Specific_Location_ID, Season, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
summarise(
Geomean = geomean(Result_Value),
`Total Number of Samples` = n(),
`% of Samples above 100` = sum(Result_Value >= 100) / n() * 100,
`% of Samples above 320` = sum(Result_Value >= 320) / n() * 100,
.groups = 'drop'
)
aggregated_df_seg
## # A tibble: 44 × 8
## Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
## <chr> <chr> <dbl> <dbl>
## 1 BLA001 Wet 47.0 -123.
## 2 BLA00101 Wet 47.0 -123.
## 3 BLA00102 Wet 47.0 -123.
## 4 BLA00103 Wet 47.0 -123.
## 5 BLA00104 Wet 47.0 -123.
## 6 BLA00105 Wet 47.0 -123.
## 7 BLA002 Dry 47.0 -123.
## 8 BLA002 Wet 47.0 -123.
## 9 BLA00201 Dry 47.0 -123.
## 10 BLA00201 Wet 47.0 -123.
## # ℹ 34 more rows
## # ℹ abbreviated names: ¹Study_Specific_Location_ID, ²Latitude_Decimal_Degrees,
## # ³Longitude_Decimal_Degrees
## # ℹ 4 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## # `% of Samples above 100` <dbl>, `% of Samples above 320` <dbl>
Now merge geomean tables into a dataframe to be used in the dashboard and save it as a csv
#Merge E.coli tables
Black_Lake_Geomean_Table <- rbind(aggregated_df, aggregated_df_seg)
#Add columns "Result_Parameter_Name" and "Meets standards"
Black_Lake_Geomean_Table$Result_Parameter_Name <- "E. coli"
Black_Lake_Geomean_Table$`Meets_Standards` <- ifelse(Black_Lake_Geomean_Table$Geomean < 100 | Black_Lake_Geomean_Table$`% of Samples above 320` < 10, "Yes", "No")
#Change data type of Black Lake Geomean 'Meets Standards'
Black_Lake_Geomean_Table$Meets_Standards <- as.character(Black_Lake_Geomean_Table$Meets_Standards)
#Merge TP table
Black_Lake_Geomean_Table <-merge(Black_Lake_Geomean_Table, aggregated_df_tp, all.x = TRUE)
Black_Lake_Geomean_Table <- bind_rows(Black_Lake_Geomean_Table, aggregated_df_tp)
#Export to csv
write.csv(Black_Lake_Geomean_Table, file="O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/Support Data/Black_Lake_Geomean_Table.csv", row.names=FALSE)
Review graphs to determine: - Do the data look reasonable? - What is the distribution like? Is it symmetric, bimodal, etc? - Are there extremely high or extremely low values? - Are there any obvious trends?
Create a boxplot using Plotly for E. coli. The plot is interactive.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
Create a boxplot using Plotly for Total Phosphorus
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
Create plots for each E.coli and Total Phosphorus results over time. The plots are interactive and you can click on the series to the right to hide or show them. Static pngs are also available for download.
library("viridis")
## Warning: package 'viridis' was built under R version 4.3.3
## Loading required package: viridisLite
# Filter the data for the specific location ID and parameter name
df_BLA001_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA001", ]
# Create the line graph
ggplot(df_BLA001_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA001") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA001_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA001", ]
# Create the line graph
ggplot(df_BLA001_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA001") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
annotate("text", x = as.Date("2024-05-01"), y = 0.005, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`
# Filter the data for the specific location ID and parameter name
df_BLA002_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA002", ]
# Create the line graph
ggplot(df_BLA002_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA002") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA002_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA002", ]
# Create the line graph
ggplot(df_BLA002_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA002") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`
# Filter the data for the specific location ID and parameter name
df_BLA003_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA003", ]
# Create the line graph
ggplot(df_BLA003_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA003") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 140, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA003_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA003", ]
# Create the line graph
ggplot(df_BLA003_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA003") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`
# Filter the data for the specific location ID and parameter name
df_BLA004_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA004", ]
# Create the line graph
ggplot(df_BLA004_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA004") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA004_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA004", ]
# Create the line graph
ggplot(df_BLA004_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA004") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`
# Filter the data for the specific location ID and parameter name
df_BLA005_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA005", ]
# Create the line graph
ggplot(df_BLA005_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA005") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 145, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA005_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA005", ]
# Create the line graph
ggplot(df_BLA005_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA005") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_hline(yintercept = 0.01, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 0.011, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
# Filter the data for the specific location ID and parameter name
df_BLA006_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA006", ]
# Create the line graph
ggplot(df_BLA006_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA006") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 150, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA006_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA006", ]
# Create the line graph
ggplot(df_BLA006_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA006") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
annotate("text", x = as.Date("2024-05-01"), y = 0.016, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`
# Filter the data for the specific location ID and parameter name
df_BLA007_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA007", ]
# Create the line graph
ggplot(df_BLA007_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA007") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 155, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA007_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA007", ]
# Create the line graph
ggplot(df_BLA007_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA007") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`
# Filter the data for the specific location ID and parameter name
df_BLA009_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA009", ]
# Create the line graph
ggplot(df_BLA009_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA009") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA009_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA009", ]
# Create the line graph
ggplot(df_BLA009_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA009") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_hline(yintercept = 0.01, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 0.011, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
# Filter the data for the specific location ID and parameter name
df_BLA008_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA008", ]
# Create the line graph
ggplot(df_BLA008_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "E. coli (MPN/100mL)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA008") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_hline(yintercept = 100, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 95, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA008_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA008", ]
# Create the line graph
ggplot(df_BLA008_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
geom_line(linewidth = 0.9, color = "darkcyan") +
labs(x = "Month", y = "Total Phosphorus (mg/L)") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12, face = "bold")
) +
ggtitle("Concentration of Total Phosphorus in Streamwater at BLA008") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
geom_hline(yintercept = 0.01, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 0.0105, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1) +
geom_hline(yintercept = 0.0195, color = "darkorange3") +
annotate("text", x = as.Date("2024-05-01"), y = 0.02, label = "TMDL Target", color = "darkorange3", size = 3.5, hjust = 1)
# Load the necessary library
#library(dplyr)
# Filter for relevant parameters
filtered_df <- new_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus"))
# Create pairs based on Field_Collection_Start_Date and Study_Specific_Location_ID
paired_data <- filtered_df %>% group_by(Study_Specific_Location_ID, Field_Collection_Start_Date) %>% filter(n() >= 2) %>% pivot_wider(names_from = Result_Parameter_Name, values_from = Result_Value)
# Remove groups that do not have both parameters
paired_data <- paired_data %>%
filter(!is.na(`E. coli`) & !is.na(`Total Phosphorus`))
valid_data <- paired_data %>% group_by(Study_Specific_Location_ID) %>% filter(n() > 1) %>% summarise(correlation = ifelse(n() > 1, cor(`E. coli`, `Total Phosphorus`, use = "complete.obs"), NA), p_value = ifelse(n() > 1, cor.test(`E. coli`, `Total Phosphorus`, method = "pearson")$p.value, NA))
# Test for association within each group
#association_tests <- paired_data %>%
# group_by(Study_Specific_Location_ID) %>%
#summarize(correlation = cor(`E. coli`, `Total Phosphorus`, use = "complete.obs"),
# p_value = cor.test(`E. coli`, `Total Phosphorus`, method = "pearson")$p.value)
# Output the results
#print(association_tests)
print(valid_data)
## # A tibble: 0 × 3
## # ℹ 3 variables: Study_Specific_Location_ID <chr>, correlation <lgl>,
## # p_value <lgl>
# Filter dataframe for Study_Specific_Location_ID = BLA001
BLA001_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA001")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA001_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA001)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA002
BLA002_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA002")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA002_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA002)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA003
BLA003_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA003")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA003_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA003)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA004
BLA004_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA004")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA004_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA004)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA005
BLA005_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA005")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA005_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA005)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA006
BLA006_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA006")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA006_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA006)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA007
BLA007_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA002")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA007_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA007)")
## `geom_smooth()` using formula = 'y ~ x'
# Filter dataframe for Study_Specific_Location_ID = BLA009
BLA009_df <- new_df %>%
filter(Study_Specific_Location_ID == "BLA009")
# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date
paired_df <- BLA009_df %>%
filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
group_by(Field_Collection_Start_Date) %>%
summarise(
E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
) %>%
filter(E_coli > 0 & Total_Phosphorus > 0)
# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")
# Add correlation to the ggplot
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef),
hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)",
title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA009)")
## `geom_smooth()` using formula = 'y ~ x'
From the review of this report, we should gain a sense of data quality. Our prepped dataset should now meet our acceptance criteria.
Further assessment still needs to be done: Data should be of appropriate quality to meet both our performance and acceptance criteria. Performance criteria needs to be established and appropriate amendments and analyses should follow before decision-making.
knitr::opts_chunk$set(echo = TRUE)